We’ll be running through some basic statistical analysis of the data we have generated - this will be broken up into a few main parts
Please note this is not an exhaustive list of the analysis you could do with your data, but some good starting points for which to jump off from.
Before we can analyse our data, we need to import and format the files we created during the bioinformatic analysis, as well as the metadata file. We will also need to load in the necessary R libraries.
## LOAD PACKAGES ##
#install.packages('tidyverse', 'viridis', 'indicspecies', 'vegan', 'glue', 'reshape2','leaflet','htmltools','RColorBrewer')
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
##Phyloseq installer
#The following initializes usage of Bioc devel
#BiocManager::install(version='devel')
#BiocManager::install("phyloseq")
#BiocManager::install("microbiome")
#iNEXT 3D installer
#install.packages('devtools')
library(devtools)
#install_github('AnneChao/iNEXT.3D')
library('tidyverse')
library('viridis')
library('indicspecies')
library('vegan')
library('glue')
library('reshape2')
library('phyloseq')
library('iNEXT.3D')
library('ggplot2')
library('microbiome')
## FORMAT DATA ##
setwd("~/Downloads/Statistics_GO_eDNAhub-main/")
#set to yours
## read in the data and metadata
df <- read.delim('zotu_table.txt', check.names = FALSE, row.names = 1)
tax <- read.delim('taxonomy_file.txt', check.names = FALSE, row.names = 1, header = T)
meta <- read.csv('meta_data.csv', check.names = FALSE, row.names = 1)
A helpful R package that might be good to help sort, pre-process and
visualise your data is Phyloseq. It was originally made for
microbiome metabarcoding data, but does just as well when using eDNA
data. Just keep in mind, because of the nature of eDNA, for a lot of the
analysis steps, you will need to use presence-absence transformed
data.
#First, we need to make sure that our taxonomy file is set up as a matrix
tax_mat<-as.matrix(tax)
#Set up files into file structure
otu <- otu_table(df, taxa_are_rows = TRUE)
taxa <- tax_table(tax_mat)
sample <- sample_data(meta)
#Bring all files together into a phyloseq object
FisheDNA<-phyloseq(otu, taxa, sample)
FisheDNA
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 97 taxa and 59 samples ]
## sample_data() Sample Data: [ 59 samples by 6 sample variables ]
## tax_table() Taxonomy Table: [ 97 taxa by 8 taxonomic ranks ]
#Lets explore our data using phyloseq
sample_names(FisheDNA)[1:5]
## [1] "AM101" "AM102" "AM104" "AM105" "AM11"
rank_names(FisheDNA)
## [1] "root" "kingdom" "phylum" "class" "order" "family" "genus"
## [8] "species"
otu_table(FisheDNA)[1:5, 1:5]
## OTU Table: [5 taxa and 5 samples]
## taxa are rows
## AM101 AM102 AM104 AM105 AM11
## zotu.1 9152 0 0 9092 17795
## zotu.10 0 0 0 0 0
## zotu.11 2 0 0 0 528
## zotu.12 0 0 0 0 1
## zotu.13 736 0 0 5 0
tax_table(FisheDNA)[1:5, 1:4]
## Taxonomy Table: [5 taxa by 4 taxonomic ranks]:
## root kingdom phylum class
## zotu.1 "Root" "Eukaryota" "Chordata" "Actinopteri"
## zotu.10 "Root" "Eukaryota" "Chordata" "Actinopteri"
## zotu.11 "Root" "Eukaryota" "Chordata" "Actinopteri"
## zotu.12 "Root" "Eukaryota" "Chordata" "Actinopteri"
## zotu.13 "Root" "Eukaryota" "Chordata" "Actinopteri"
taxa_names(FisheDNA)[1:10]
## [1] "zotu.1" "zotu.10" "zotu.11" "zotu.12" "zotu.13" "zotu.14" "zotu.15"
## [8] "zotu.16" "zotu.17" "zotu.18"
#Preprocess this data (examples)
FisheDNA.1 = subset_taxa(FisheDNA, class=="Actinopteri") #Subset taxa belonging to the class Actinopteri
FisheDNA.2 = prune_samples(sample_sums(FisheDNA)>=20, FisheDNA) #Remove samples with less than 20 total reads.
FisheDNA.3= filter_taxa(FisheDNA, function(x) sum(x > 3) > (0.2*length(x)), TRUE) #Remove taxa not seen more than 3 times in at least 20% of the samples.
#Standardise abundances to the median sequencing depth
total = median(sample_sums(FisheDNA))
standf = function(x, t=total) round(t * (x / sum(x)))
FisheDNA.4= transform_sample_counts(FisheDNA, standf)
FisheDNA #Original data frame
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 97 taxa and 59 samples ]
## sample_data() Sample Data: [ 59 samples by 6 sample variables ]
## tax_table() Taxonomy Table: [ 97 taxa by 8 taxonomic ranks ]
FisheDNA.1 #Subsetting taxa to Actinopteri
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 63 taxa and 59 samples ]
## sample_data() Sample Data: [ 59 samples by 6 sample variables ]
## tax_table() Taxonomy Table: [ 63 taxa by 8 taxonomic ranks ]
FisheDNA.2 #Removing samples with less than 20 reads
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 97 taxa and 59 samples ]
## sample_data() Sample Data: [ 59 samples by 6 sample variables ]
## tax_table() Taxonomy Table: [ 97 taxa by 8 taxonomic ranks ]
FisheDNA.3 #Remove taxa not seen more than 3 times in at least 20% of the samples.
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 10 taxa and 59 samples ]
## sample_data() Sample Data: [ 59 samples by 6 sample variables ]
## tax_table() Taxonomy Table: [ 10 taxa by 8 taxonomic ranks ]
FisheDNA.4 #Standardised abundance
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 97 taxa and 59 samples ]
## sample_data() Sample Data: [ 59 samples by 6 sample variables ]
## tax_table() Taxonomy Table: [ 97 taxa by 8 taxonomic ranks ]
otu_table(FisheDNA.4)[1:5, 1:5]
## OTU Table: [5 taxa and 5 samples]
## taxa are rows
## AM101 AM102 AM104 AM105 AM11
## zotu.1 7406 0 0 9756 15130
## zotu.10 0 0 0 0 0
## zotu.11 2 0 0 0 449
## zotu.12 0 0 0 0 1
## zotu.13 596 0 0 5 0
#Okay, now lets preprocess this data for our current analysis
FisheDNA.5 <- filter_taxa(FisheDNA, function(x){sum(x > 0) > 1}, prune=TRUE) #removing singletons
FisheDNA.5
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 71 taxa and 59 samples ]
## sample_data() Sample Data: [ 59 samples by 6 sample variables ]
## tax_table() Taxonomy Table: [ 71 taxa by 8 taxonomic ranks ]
#transform into a presence-absence dataset
Fish.eDNA.pa <- microbiome::transform(FisheDNA.5, 'pa')
#abundance of reads across Sample Number
p = plot_bar(FisheDNA.5, "SampleNo", fill="species")
p + geom_bar(aes(color=species, fill=species), stat="identity", position="stack") +
ggtitle("Read abundance across sample number ") #makes a more aesthetic plot
#abundance of reads across Locations
p1 = plot_bar(FisheDNA.5, "Location", fill="species")
p1 + geom_bar(aes(color=species, fill=species), stat="identity", position="stack") +
ggtitle("Read abundance across Locations")
#Lets make some relative abundance plots
#First step is to transform to relative abundance
FisheDNA.6 = transform_sample_counts(Fish.eDNA.pa, function(x) x / sum(x) )
FisheDNA.6
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 71 taxa and 59 samples ]
## sample_data() Sample Data: [ 59 samples by 6 sample variables ]
## tax_table() Taxonomy Table: [ 71 taxa by 8 taxonomic ranks ]
data_fish <- psmelt(FisheDNA.6)
relabundance_plot <- ggplot(data=data_fish, aes(x=SampleNo, y=Abundance, fill=species)) + facet_grid(~Location, scales = "free")
relabundance_plot + geom_bar(aes(), stat="identity", position="fill")
#Lets get these objects out of phyloseq and use them for further analysis
#First, the raw OTU table
Fish.OTU <- as(otu_table(FisheDNA.5), "matrix")
Fish_OTU_df = as.data.frame((Fish.OTU))
Fish_OTU_df= t(Fish_OTU_df)
#now presence-absence
Fish.pa.OTU <- as(otu_table(Fish.eDNA.pa), "matrix")
Fish.pa_OTU_df = as.data.frame((Fish.pa.OTU))
Fish.pa_OTU_df= t(Fish.pa_OTU_df)
Fish_Sample = as(sample_data(Fish.eDNA.pa), "matrix")
Fish_Sample_df = as.data.frame(Fish_Sample)
Fish_taxa <- tax_table(Fish.eDNA.pa)
Fish_taxa_df <- as.data.frame(Fish_taxa)
Fish_PA_OTU_Sample = cbind(Fish_Sample_df, Fish.pa_OTU_df)
One of the first things we would want to look at, is to determine if sufficient sequencing depth was achieved. We can do this by creating rarefaction curves, which work similar to tradtional species accumulation curves, whereby we randomly select sequences from a sample and determine if new species or OTUs are being detected. If the curve flattens out, it gives the indication sufficient sequencing has been conducted to recover most of the diversity in the dataset.
########################
## RAREFACTION CURVES ##
########################
## identify the lowers number of reads for samples and generate rarefaction curves
raremax_df <- min(rowSums(Fish_OTU_df))
rarecurve(Fish_OTU_df, step = 100, sample = 1, col = 'blue', cex = 0.5)
The next step of preliminary data exploration, is to determine what is contained in our data. We can do this by plotting the abundance of each taxon for every sample in a stacked bar plot.
First we are going to look at the overall species richness (number of species) in a sample.
######################
## SPECIES RICHNESS ##
######################
## calculate number of taxa detected per sample and group per sampling location
rich<-estimate_richness(Fish.eDNA.pa, measures=c("Observed"))
rich<-cbind(rich, Fish_Sample)
## test assumptions of statistical test, first normal distribution, next homoscedasticity
histogram(~ Observed | Location, data = rich, layout = c(2,2))
shapiro.test(rich$Observed)
##
## Shapiro-Wilk normality test
##
## data: rich$Observed
## W = 0.94099, p-value = 0.006554
bartlett.test(Observed ~ Location, data = rich)
##
## Bartlett test of homogeneity of variances
##
## data: Observed by Location
## Bartlett's K-squared = 9.4743, df = 3, p-value = 0.02361
So we don’t currently meet the assumptions to run a parametric ANOVA. We will need to use a non-parametric test.
#non-parametric - need to use Kruskal-Wallis test
kruskal.test(Observed ~ Location, data = rich)
##
## Kruskal-Wallis rank sum test
##
## data: Observed by Location
## Kruskal-Wallis chi-squared = 18.245, df = 3, p-value = 0.0003915
boxplot(Observed ~ Location, data = rich, ylab = 'Species Richness', xlab = 'Location')
#make a ggplot boxplot
ggplot(rich, aes(x=Location, y=Observed, colour = Location)) +
geom_boxplot() +
ylab("Observed zotu richness")+
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust=1))
#pairwise test - lets look at the difference in richness
pairwise.wilcox.test(rich$Observed, rich$Location,
p.adjust.method = "BH")
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: rich$Observed and rich$Location
##
## Mudflats Openwater Rocky shore
## Openwater 0.0456 - -
## Rocky shore 0.0183 0.0046 -
## Sandy beach 0.0183 0.0046 0.8410
##
## P value adjustment method: BH
Species richness is significantly different between groups, lower in open water location.
iNEXT (iNterpolation and EXTrapolation) is an R package, available in CRAN and Github, for rarefaction and extrapolation of species diversity (Hill numbers). It’s a useful package to use if you want to make any predictions about sampling size or diversity.
Hill numbers are a set of diversity indices used in ecology and biodiversity assessments to quantify the richness and evenness of species in a given community. These indices were developed by Michael Hill and are widely used to summarize various aspects of biodiversity. Hill numbers are particularly useful because they provide a way to focus on different facets of diversity, depending on the specific questions or goals of a study. For a thorough introduction to Hill numbers check out: Alberdi & Gilbert, A guide to the application of Hill numbers to DNA-based diversity analyses. Molecular Ecology 2019
There are several Hill numbers, but they all share a common formula that relies on an exponent, denoted as “q.” The choice of the exponent determines which aspect of diversity is emphasized:
Richness (q = 0): Hill number with q = 0 is also known as the “species richness” or “taxonomic distinctness” index. It simply counts the number of unique species in a community without considering their abundances. This index is valuable when you want to emphasize the presence or absence of species but not their relative abundance.
Exponential Shannon entropy (q = 1): This index corresponds to the Shannon-Wiener diversity index (commonly referred to as Shannon entropy) when q is set to 1. It considers both the richness and the evenness of species abundances. It is often used when you want to give equal importance to species richness and evenness.
Inverse Simpson index (q = 2): The Inverse Simpson index corresponds to the Simpson diversity index when q is set to 2. This index emphasizes the dominance of the most abundant species in a community. It is often used when you want to give more weight to the presence of dominant species.
Higher Hill numbers (q > 2): These Hill numbers give increasing weight to the most abundant species, placing more emphasis on the dominant species in the community. The larger the value of q, the more sensitive the index becomes to the presence of dominant species.
Hill numbers offer a flexible framework for assessing biodiversity because they allow you to choose the appropriate value of q based on the specific ecological question or management objective you are addressing. For example:
If you want to measure the overall diversity of a community while considering both species richness and evenness, you might use q = 1.
If you are concerned about the impact of a few highly dominant species in an ecosystem, you might use q = 2 or a higher value to focus on those dominant species.
#iNEXT package iNEXT is a neat package that can calculate Hill numbers, enabling us to investigate alpha diversity accumulation curves (similar to species accumulation curves). These graphs can help us determine if sufficient sampling was conducted at each site, based on the plateauing of the curves, similar to the rarefaction curves. But iNEXT.3D goes beyond rarefraction and extrapolates alpha diversity numbers beyond the number of samples collected (default is double sample number).
library(iNEXT.3D)
#inext 3d takes a really weird input
#generate a list of the unique sample locations
categories <- unique(sample_data(Fish.eDNA.pa)$Location)
#create empty list
split_physeq_list <- list()
#fill empty list with a presence absence OTU from each location
for (category in categories) {
sub_physeq.100 <- subset_samples(Fish.eDNA.pa, Location == category)
split_physeq_list[[category]] <- otu_table(sub_physeq.100)
}
#make the lists a matrix of lists
matrix_list <- lapply(split_physeq_list, function(x) {
otu_table <- as(x, "matrix")
return(otu_table)
})
# Create a list named "data" to store matrices
matrix_list <- list(data = list())
# Convert each split phyloseq object into a matrix and store it under "data"
for (category in categories) {
otu_table <- as(otu_table(split_physeq_list[[category]]), "matrix")
matrix_list[["data"]][[category]] <- otu_table
}
#lets have a look at it:
#matrix_list$data
#I told you it was a weird format!
#now we can run iNEXT3D!
#we have our data set up as incidence raw, there is several types you can input (they are all weird)
#https://github.com/KaiHsiangHu/iNEXT.3D
#we are going to calculate the taxonomic diversity, across all three hill numbers, using the default bootstrapping (50) and default endpoint for the extrapolation (double the input sample number)
out.raw <- iNEXT3D(data = matrix_list$data, diversity = 'TD', q = c(0, 1, 2), datatype = 'incidence_raw', nboot = 50)
#there is various ways to look at these plots:
#within a sample
ggiNEXT3D(out.raw, type = 1, facet.var = 'Assemblage') + facet_wrap(~Assemblage, nrow = 3)
#across samples
ggiNEXT3D(out.raw, type = 1, facet.var = "Order.q")
#sample completeness
ggiNEXT3D(out.raw, type = 2, facet.var = "Order.q", color.var = "Assemblage")
Ordination is a multivariate analysis which aims to find gradients or changes in species composition across groups. Multivariate species composition can be imagined as samples in multidimensional space, where every species represents its own axis. Because multidimensional space is not easy to display, describe, or even imagine, it is worth reducing it into a few main dimensions while preserving the maximum information. In ordination space, objects that are closer together are more similar, and dissimilar objects are farther.
The choice of ordination methods depends on 1) the type of data you have, 2) the similarity distance matrix you want/can use, and 3) what you want to say. Different ordination methods use different similarity matrices, and can significantly affect the results. For example, Principle Components Analysis (PCA) will use only Euclidean distance, while Non-metric Multidimensional Scaling (nMDS) or Principle Coordinates Analysis (PCoA) use any similarity distance matrix you want.
So, how to choose a method?
Many other ordination methods exist including RDA (Redundancy Analysis), CAP (Canonical Analysis of Principle coordinates), dbRDA (Distance Cased Redundancy Analysis) (and many more). Some methods will be better than others to show complex community or a specific effect of a factor on your data. For example, CAP will be good to show the effect of the interaction between factors on your community. So sometimes, it is good to try different methods if you are not happy about the results, but keep in mind that these methods are “only” ordination, and you need to perform test for significant differences between groups (e.g. ANOSIM, PERMANOVA).
Often different ordination methods will have different features/characteristics than you will find interesting, such overlay vectors or extra variables, % explained by each axis, 3D plots etc. However, all these details are more software related than truly related to the ordination methods.
For more information about different types of ordination analysis, see this blog: https://www.davidzeleny.net/anadat-r/doku.php/en:ordination or this review article: https://onlinelibrary.wiley.com/doi/full/10.1111/mec.13536
So, now for some examples.
NMDS is a technique that attempts to represent the pairwise dissimilarities or distances between samples in a lower-dimensional space. The goal is to position the samples in such a way that their relative distances in the ordination plot reflect the original dissimilarity structure as closely as possible
NMDS is a non-metric technique, meaning that it does not assume a linear relationship between the original distances and the distances in the ordination space. It focuses on preserving the ranking or order of dissimilarities rather than their precise values.
#NMDS Plot
OTU_Dist <- vegdist(Fish.pa_OTU_df, method="jaccard") #create distance matrix
ord <- metaMDS(OTU_Dist,
distance = "jaccard", #dissimilarity method
k = 2, #number of dimensions
maxit = 999, #max number of iterations
trymax = 500, #max number of random starts - may need to play around with this if you have many zeros in your dataset
wascores = TRUE) #is a method of calculating species scores, default is TRUE
## Run 0 stress 0.1974762
## Run 1 stress 0.1974774
## ... Procrustes: rmse 0.0005026704 max resid 0.003511701
## ... Similar to previous best
## Run 2 stress 0.197483
## ... Procrustes: rmse 0.003231447 max resid 0.01973713
## Run 3 stress 0.19838
## Run 4 stress 0.1983815
## Run 5 stress 0.1974774
## ... Procrustes: rmse 0.002492477 max resid 0.01752266
## Run 6 stress 0.198379
## Run 7 stress 0.197478
## ... Procrustes: rmse 0.002710129 max resid 0.01904769
## Run 8 stress 0.1974753
## ... New best solution
## ... Procrustes: rmse 0.00154876 max resid 0.01089165
## Run 9 stress 0.2349809
## Run 10 stress 0.1974763
## ... Procrustes: rmse 0.0005572154 max resid 0.003916486
## ... Similar to previous best
## Run 11 stress 0.198378
## Run 12 stress 0.1974818
## ... Procrustes: rmse 0.003477703 max resid 0.02031456
## Run 13 stress 0.204615
## Run 14 stress 0.1983797
## Run 15 stress 0.197485
## ... Procrustes: rmse 0.003861686 max resid 0.02037428
## Run 16 stress 0.1974774
## ... Procrustes: rmse 0.0009989505 max resid 0.007019531
## ... Similar to previous best
## Run 17 stress 0.2299323
## Run 18 stress 0.1974818
## ... Procrustes: rmse 0.003172852 max resid 0.01825593
## Run 19 stress 0.1974833
## ... Procrustes: rmse 0.003681975 max resid 0.02036952
## Run 20 stress 0.1983789
## *** Best solution repeated 2 times
ord$stress #stress value
## [1] 0.1974753
plot(ord)
When you run the above code you will get a full result of your NMDS. The NMDS will run to a minimized stress value.
It is common for NMDS analyses to start by running with 2-dimensions (k), but you want to increase the number of dimensions to ensure a minimized stress value. Keep in mind that anything more than 5-dimensions makes it difficult to interpret a 2-dimensional plot.
As a rule of thumb literature has identified the following cut-off values for stress-level:
Higher than 0.2 is poor (risks for false interpretation). 0.1 - 0.2 is fair (some distances can be misleading for interpretation). 0.05 - 0.1 is good (can be confident in inferences from plot). Less than 0.05 is excellent (this can be rare).
One other way to check how well the ordination plots represent real
data is by using the goodness function. You can produce goodness of fit
statistics for each observation (points). You can also use the function
stressplot to create a Shepard diagram displaying two
correlation-like statistics for goodness of fit between ordination
distances and observed dissimilarity. This shows how closely our
ordination fits real world plot dissimilarities and how well we can
interpret the ordination.
# Shepards test/goodness of fit
goodness(ord) # Produces a results of test statistics for goodness of fit for each point
## [1] 0.02441838 0.02706093 0.03309696 0.02294813 0.01459349 0.01634279
## [7] 0.02372183 0.02343284 0.02368286 0.02258004 0.01960201 0.02573182
## [13] 0.02288809 0.02022869 0.01803294 0.02616773 0.02410730 0.01144397
## [19] 0.02163326 0.01663623 0.01300877 0.01673163 0.01822703 0.01405865
## [25] 0.02628627 0.03753405 0.02610843 0.03354501 0.02364699 0.02933737
## [31] 0.03296926 0.02067734 0.02680149 0.02576816 0.02924918 0.03426824
## [37] 0.03317572 0.02850221 0.04036513 0.02906241 0.03373234 0.01991701
## [43] 0.02401955 0.02457265 0.04063613 0.02320178 0.02314318 0.02702782
## [49] 0.03985527 0.02573182 0.01931488 0.01799875 0.02353822 0.02903204
## [55] 0.02469473 0.02052310 0.02978536 0.01875789 0.02307019
stressplot(ord) # Produces a Shepards diagram
The Shepard plot identifies a strong correlation between observed dissimilarity and ordination distance (R2 = 0.828), highlighting a pretty good goodness-of-fit of the NMDS (remember, we want this as close to 1 as possible).
Now lets make a plot that looks nice on ggplot
both.scores = cbind(scores(ord), Fish_Sample_df)
ggplot(both.scores) +
geom_point( aes(x=NMDS1,y=NMDS2, colour = Location),size=2) +
stat_ellipse(aes(x=NMDS1, y=NMDS2, colour= Location))+
coord_equal() +
ggtitle("NMDS") +
theme_classic()
You can also do this on phyloseq
FisheDNA.ord <- ordinate(Fish.eDNA.pa, "NMDS", "jaccard")
## Run 0 stress 0.1974762
## Run 1 stress 0.197483
## ... Procrustes: rmse 0.004453985 max resid 0.02005521
## Run 2 stress 0.1974781
## ... Procrustes: rmse 0.002702307 max resid 0.0189945
## Run 3 stress 0.1974807
## ... Procrustes: rmse 0.003488993 max resid 0.02002853
## Run 4 stress 0.1974827
## ... Procrustes: rmse 0.003286805 max resid 0.02011061
## Run 5 stress 0.1974848
## ... Procrustes: rmse 0.004770948 max resid 0.02318291
## Run 6 stress 0.1983778
## Run 7 stress 0.2414583
## Run 8 stress 0.2287742
## Run 9 stress 0.1974833
## ... Procrustes: rmse 0.004490947 max resid 0.02042435
## Run 10 stress 0.1983785
## Run 11 stress 0.2287722
## Run 12 stress 0.1983783
## Run 13 stress 0.1974815
## ... Procrustes: rmse 0.004048733 max resid 0.01996547
## Run 14 stress 0.2302036
## Run 15 stress 0.2255238
## Run 16 stress 0.1974756
## ... New best solution
## ... Procrustes: rmse 0.001745731 max resid 0.01223529
## Run 17 stress 0.1983806
## Run 18 stress 0.1974769
## ... Procrustes: rmse 0.0006246905 max resid 0.004362772
## ... Similar to previous best
## Run 19 stress 0.1974837
## ... Procrustes: rmse 0.003590057 max resid 0.01990643
## Run 20 stress 0.1974824
## ... Procrustes: rmse 0.003435316 max resid 0.01987814
## *** Best solution repeated 1 times
FisheDNA.ord
##
## Call:
## metaMDS(comm = veganifyOTU(physeq), distance = distance)
##
## global Multidimensional Scaling using monoMDS
##
## Data: veganifyOTU(physeq)
## Distance: jaccard
##
## Dimensions: 2
## Stress: 0.1974756
## Stress type 1, weak ties
## Best solution was repeated 1 time in 20 tries
## The best solution was from try 16 (random start)
## Scaling: centring, PC rotation, halfchange scaling
## Species: expanded scores based on 'veganifyOTU(physeq)'
#
NMDSphylo = plot_ordination(Fish.eDNA.pa, FisheDNA.ord, type="samples", color="Location")
NMDSphylo + stat_ellipse(aes(x=NMDS1, y=NMDS2, colour= Location))+
geom_point(size=2) + coord_equal() +
ggtitle("NMDS phyloseq") +
theme_classic()
Looks like the open water samples are doing something different here than they were in the Presence-absence data. May be worth checking comparing them.
Principal coordinates analysis (PCoA) is also known as classical or metric MDS (Multidimensional Scaling). It aims to preserve the original pairwise distances as closely as possible while transforming the data into a lower-dimensional space. PCoA is a metric technique, which assumes a linear relationship between the original distances and the distances in the ordination space. It aims to maintain the actual distances as much as possible.
If the distance measure is Euclidean, PCoA is identical to PCA (principal components analysis)
#PCoA
ordination_mds <- wcmdscale(OTU_Dist, eig = TRUE)
pcoa_df <- data.frame(ordination_mds$points)
colnames(pcoa_df) <- c("PCo1", "PCo2")
pcoa_df$Location <- factor(Fish_Sample_df$Location) #add group of interest, mine was Location data
percent_explained <- 100 * ordination_mds$eig /sum(ordination_mds$eig)
pretty_pe <- round(percent_explained[1:2], digits = 1)
pretty_pe
## [1] 16.0 11.6
labs <- c(glue("PCo1 ({pretty_pe[1]}%)"),
glue("PCo2 ({pretty_pe[2]}%)"))
ggplot(pcoa_df, aes(x = PCo1, y = PCo2, color = Location)) +
geom_point(size = 2) +
stat_ellipse(aes(x = PCo1, y = PCo2, color = Location)) +
ggtitle("PCOA") +
theme_classic() +
labs(x=labs[1], y=labs[2])
ordination_eigen <- ordination_mds$eig
ordination_eigenvalue <- ordination_eigen/sum(ordination_eigen)
ordination_eigen_frame <- data.frame(Inertia = ordination_eigenvalue*100, Axes = c(1:57)) #here we can see the axes we will be using and the eigen values
head(ordination_eigen_frame)
## Inertia Axes
## 1 16.016699 1
## 2 11.634903 2
## 3 9.250295 3
## 4 7.108934 4
## 5 5.875497 5
## 6 5.592366 6
eigenplot <- ggplot(data = ordination_eigen_frame, aes(x = factor(Axes), y = Inertia)) +
geom_bar(stat = "identity") +
scale_y_continuous(expand = c(0,0), limits = c(0, 50)) +
theme_classic() +
xlab("Axes") +
ylab("Inertia %") +
theme(axis.ticks.x = element_blank(), axis.text.x = element_blank())
eigenplot
ordination_mds$GOF #goodness of fit
## [1] 0.9734781 1.0000000
In PCoA, the eigenvalues represent the amount of variance explained by each axis (dimension) in the ordination plot. The first eigenvalue explains the most variance, the second eigenvalue explains the second most, and so on. The larger the eigenvalue, the more important that axis is in capturing the variability in the original data. So when displaying a plot in 2 dimensions, its important to look at how much of the variation in your data is explained in those two dimensions (Eigenvalue 1 and Eigenvalue 2).
In our example data: Axes 1 = 16.02% Axes 2 = 11.63%
Total = 27.65% of the variation explained in the PCoA plot
Goodness of Fit is also worth looking at - you want to maximise these values.
You can also do this in phyloseq
FisheDNA.PCoA = ordinate(Fish.eDNA.pa, "PCoA", "jaccard", weighted=TRUE)
PCoAphylo = plot_ordination(Fish.eDNA.pa, FisheDNA.PCoA, type="samples", color="Location")
PCoAphylo + stat_ellipse(aes(x=Axis.1, y=Axis.2, colour= Location))+
geom_point(size=2) + coord_equal() +
ggtitle("PCoA phyloseq") +
theme_classic()
We can also fit some environmental vectors to our plots using the
envfit function. In the interest of time I will be showing
this on the PCoA plot, but you can also use the ordination from the NMDS
and see what it generates.
Note, it is not advisable to run this on constrained analysis, as you would be looking at overinflated correlations. Works well with NMDS or the PCoA we are using today.
We will be looking at a four Vectors (number only values): - Temperature (Temperature) - Dissolved oxygen (Diss.O) - Latitude (Lat) - Longitude (Long)
And 1 Factor: - Habitat type (Location)
#the sample numbers removed - makes interpretation easier....
meta_env <- meta %>% select(-SampleNo)
en = envfit(ordination_mds, meta_env, permutations = 999, na.rm = TRUE)
en
##
## ***VECTORS
##
## Dim1 Dim2 r2 Pr(>r)
## Temperature -0.98044 0.19682 0.0887 0.067 .
## Diss.O -0.92447 0.38126 0.0489 0.239
## Lat 0.58845 0.80853 0.1587 0.012 *
## Long 0.95376 0.30058 0.1516 0.009 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Permutation: free
## Number of permutations: 999
##
## ***FACTORS:
##
## Centroids:
## Dim1 Dim2
## LocationMudflats 0.0533 -0.0356
## LocationOpenwater 0.3548 0.0092
## LocationRocky shore -0.0015 0.1991
## LocationSandy beach -0.1382 -0.1177
##
## Goodness of fit:
## r2 Pr(>r)
## Location 0.3443 0.001 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Permutation: free
## Number of permutations: 999
{ordiplot(ordination_mds, display = 'sites')
plot(en, p.max = 0.05, col = "red")}
It looks like the habitat type and the latitude and longitude is the biggest driver in community composition of the fish (significant correlations with the ordination scores) - which makes sense when we think about the fish ecology within the region. Note that because latitude and longitude are vectors, we are getting a correlation added onto the plot.
Could we show this data in a non-linear way?
ordisurf(ordination_mds, meta_env[, 'Lat'], main = 'Latitude', display = 'sites' )
##
## Family: gaussian
## Link function: identity
##
## Formula:
## y ~ s(x1, x2, k = 10, bs = "tp", fx = FALSE)
##
## Estimated degrees of freedom:
## 4.39 total = 5.39
##
## REML score: -226.8017
Now we will test whether the patterns we observed in the nMDS or the PCoA plot are actually statistically significant.
PERmutational Multivariate ANalysis of VAriance (PERMANOVA) is a permutation-based technique – it makes no distributional assumptions about multivariate normality or homogeneity of variances. This means it is the non-parametric equivalent of Multivariate Analysis of Variance (MANOVA). The main goal of PERMANOVA is to determine whether there are statistically significant differences between the groups based on the multivariate data, while accounting for the inherent variability within each group.
The PERMANOVA procedure works by using permutation testing. Here’s a basic outline of how it works:
Data Setup: You have a dataset with multiple variables (multivariate) and multiple groups or conditions.
Dissimilarity Matrix: A dissimilarity matrix is created based on the multivariate data. This matrix quantifies the dissimilarity or distance between individual observations within the dataset (here you could pick what matrix you would like to use, in our example we are using a Jaccard matrix as we are using Presence-Absence data).
Permutation Testing: PERMANOVA works by shuffling the group labels (permuting) and then recalculating the dissimilarity matrix and relevant statistical tests. This process is repeated many times to create a distribution of test statistics under the assumption that the group labels have no effect on the data.
Comparison to Observed Data: The observed test statistic (usually a sum of squares or sums of squared distances) is then compared to the distribution of test statistics generated through permutation testing. This comparison helps determine the likelihood of observing the observed test statistic if the group labels have no effect.
P-Value Calculation: The p-value is calculated as the proportion of permuted test statistics that are more extreme than the observed test statistic. A low p-value indicates that the observed differences between groups are unlikely to occur by random chance alone.
Interpretation: If the p-value is below a pre-defined significance level (e.g., 0.05), you can then conclude that there are statistically significant differences between the groups based on the multivariate data.
#PERMANOVA
adonis2(OTU_Dist ~ Location, method = "jaccard", data = Fish_Sample_df, permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = OTU_Dist ~ Location, data = Fish_Sample_df, permutations = 9999, method = "jaccard")
## Df SumOfSqs R2 F Pr(>F)
## Location 3 3.538 0.17173 3.8013 1e-04 ***
## Residual 55 17.064 0.82827
## Total 58 20.602 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
You would need to report the df, F value, P value (Pr(>F)) in the manuscript.
Determining the occurrence or abundance of a small set of indicator species, as an alternative to sampling the entire community, has been particularly useful in longterm environmental monitoring for conservation or ecological management. Species are chosen as indicators if they:
Our previous analysis has shown us that there are differences between
the groups, and I would like to look at indicator species as potentially
what species could be responsible for those differences between the
groups. We will be using the indicspecies package to look
at the indicator species for each location type.
Indicator species are often determined using an analysis of the relationship between the species occurrence or abundance values from a set of sampled sites and the classification of the same sites into site groups, which may represent habitat types, community types, disturbance states, etc. Thus, there are two data elements in an indicator species analysis: (1) the community data matrix; and (2) the vector that describes the classification of sites into groups.
Important to note for this package to work you need sites in rows and species in columns.
indval <- multipatt(Fish.pa_OTU_df, Fish_Sample_df$Location,
control = how(nperm=999))
summary(indval)
##
## Multilevel pattern analysis
## ---------------------------
##
## Association function: IndVal.g
## Significance level (alpha): 0.05
##
## Total number of species: 71
## Selected number of species: 17
## Number of species associated to 1 group: 15
## Number of species associated to 2 groups: 2
## Number of species associated to 3 groups: 0
##
## List of species associated to each combination:
##
## Group Mudflats #sps. 1
## stat p.value
## zotu.5 0.611 0.016 *
##
## Group Rocky shore #sps. 9
## stat p.value
## zotu.10 0.617 0.017 *
## zotu.17 0.596 0.016 *
## zotu.23 0.577 0.005 **
## zotu.66 0.577 0.002 **
## zotu.56 0.563 0.015 *
## zotu.14 0.516 0.033 *
## zotu.22 0.447 0.038 *
## zotu.59 0.447 0.026 *
## zotu.77 0.447 0.031 *
##
## Group Sandy beach #sps. 5
## stat p.value
## zotu.2 0.753 0.001 ***
## zotu.6 0.738 0.001 ***
## zotu.15 0.671 0.005 **
## zotu.94 0.592 0.009 **
## zotu.74 0.589 0.016 *
##
## Group Mudflats+Rocky shore #sps. 2
## stat p.value
## zotu.8 0.582 0.044 *
## zotu.20 0.514 0.023 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tax_table(Fish.eDNA.pa)
## Taxonomy Table: [71 taxa by 8 taxonomic ranks]:
## root kingdom phylum class order
## zotu.1 "Root" "Eukaryota" "Chordata" "Actinopteri" "Scombriformes"
## zotu.10 "Root" "Eukaryota" "Chordata" "Actinopteri" "Perciformes"
## zotu.11 "Root" "Eukaryota" "Chordata" "Actinopteri" NA
## zotu.12 "Root" "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes"
## zotu.13 "Root" "Eukaryota" "Chordata" "Actinopteri" "Blenniiformes"
## zotu.14 "Root" NA NA NA NA
## zotu.15 "Root" "Eukaryota" "Chordata" "Actinopteri" "Myctophiformes"
## zotu.16 "Root" "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes"
## zotu.17 "Root" NA NA NA NA
## zotu.18 "Root" "Eukaryota" "Chordata" "Actinopteri" "Labriformes"
## zotu.2 "Root" "Eukaryota" "Chordata" "Actinopteri" "Myctophiformes"
## zotu.20 "Root" "Eukaryota" "Chordata" "Actinopteri" "Blenniiformes"
## zotu.22 "Root" NA NA NA NA
## zotu.23 "Root" "Eukaryota" "Chordata" "Actinopteri" "Blenniiformes"
## zotu.24 "Root" "Eukaryota" "Chordata" "Actinopteri" "Labriformes"
## zotu.25 "Root" "Eukaryota" "Chordata" "Actinopteri" "Perciformes"
## zotu.27 "Root" NA NA NA NA
## zotu.28 "Root" "Eukaryota" "Chordata" "Actinopteri" "Gadiformes"
## zotu.3 "Root" "Eukaryota" "Chordata" "Actinopteri" "Labriformes"
## zotu.30 "Root" NA NA NA NA
## zotu.31 "Root" "Eukaryota" "Chordata" "Actinopteri" NA
## zotu.32 "Root" "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes"
## zotu.34 "Root" "Eukaryota" "Chordata" "Actinopteri" "Salmoniformes"
## zotu.39 "Root" "Eukaryota" "Chordata" "Actinopteri" "Blenniiformes"
## zotu.4 "Root" "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes"
## zotu.41 "Root" NA NA NA NA
## zotu.42 "Root" NA NA NA NA
## zotu.43 "Root" "Eukaryota" "Chordata" "Actinopteri" "Perciformes"
## zotu.44 "Root" "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes"
## zotu.47 "Root" "Eukaryota" "Chordata" "Actinopteri" "Myctophiformes"
## zotu.48 "Root" NA NA NA NA
## zotu.5 "Root" "Eukaryota" "Chordata" "Actinopteri" "Blenniiformes"
## zotu.51 "Root" "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes"
## zotu.53 "Root" "Eukaryota" "Chordata" "Actinopteri" "Mugiliformes"
## zotu.54 "Root" "Eukaryota" "Chordata" "Actinopteri" "Gadiformes"
## zotu.56 "Root" "Eukaryota" "Chordata" "Actinopteri" "Perciformes"
## zotu.59 "Root" NA NA NA NA
## zotu.6 "Root" "Eukaryota" "Chordata" "Actinopteri" "Myctophiformes"
## zotu.60 "Root" "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes"
## zotu.61 "Root" "Eukaryota" "Chordata" "Actinopteri" "Perciformes"
## zotu.62 "Root" "Eukaryota" "Chordata" "Actinopteri" "Labriformes"
## zotu.63 "Root" "Eukaryota" "Chordata" "Actinopteri" NA
## zotu.65 "Root" "Eukaryota" "Chordata" "Chondrichthyes" "Rajiformes"
## zotu.66 "Root" NA NA NA NA
## zotu.69 "Root" "Eukaryota" "Chordata" "Actinopteri" "Labriformes"
## zotu.7 "Root" "Eukaryota" "Chordata" "Actinopteri" "Mugiliformes"
## zotu.70 "Root" "Eukaryota" "Chordata" "Actinopteri" "Labriformes"
## zotu.71 "Root" "Eukaryota" "Chordata" "Actinopteri" NA
## zotu.72 "Root" NA NA NA NA
## zotu.74 "Root" "Eukaryota" "Chordata" "Actinopteri" "Myctophiformes"
## zotu.75 "Root" "Eukaryota" "Chordata" "Actinopteri" "Labriformes"
## zotu.76 "Root" "Eukaryota" "Chordata" "Actinopteri" "Labriformes"
## zotu.77 "Root" "Eukaryota" "Chordata" "Actinopteri" "Blenniiformes"
## zotu.78 "Root" NA NA NA NA
## zotu.79 "Root" "Eukaryota" "Chordata" "Actinopteri" "Myctophiformes"
## zotu.8 "Root" "Eukaryota" "Chordata" "Actinopteri" "Blenniiformes"
## zotu.80 "Root" NA NA NA NA
## zotu.81 "Root" NA NA NA NA
## zotu.82 "Root" NA NA NA NA
## zotu.83 "Root" "Eukaryota" "Chordata" "Actinopteri" "Labriformes"
## zotu.84 "Root" "Eukaryota" "Chordata" "Actinopteri" NA
## zotu.85 "Root" NA NA NA NA
## zotu.86 "Root" "Eukaryota" "Chordata" "Actinopteri" "Labriformes"
## zotu.88 "Root" NA NA NA NA
## zotu.9 "Root" "Eukaryota" "Chordata" "Actinopteri" "Centrarchiformes"
## zotu.90 "Root" "Eukaryota" "Chordata" "Actinopteri" NA
## zotu.91 "Root" "Eukaryota" "Chordata" "Actinopteri" "Perciformes"
## zotu.92 "Root" "Eukaryota" "Chordata" "Actinopteri" NA
## zotu.93 "Root" NA NA NA NA
## zotu.94 "Root" "Eukaryota" "Chordata" "Actinopteri" "Myctophiformes"
## zotu.95 "Root" NA NA NA NA
## family genus species
## zotu.1 "Gempylidae" "Thyrsites" "Thyrsites_atun"
## zotu.10 "Bovichtidae" "Bovichtus" "Bovichtus_variegatus"
## zotu.11 NA NA NA
## zotu.12 "Clupeidae" "Sprattus" "Sprattus_antipodum"
## zotu.13 "Tripterygiidae" "Forsterygion" "Forsterygion_lapillum"
## zotu.14 NA NA NA
## zotu.15 "Myctophidae" "Lampichthys" "Lampichthys_procerus"
## zotu.16 "Clupeidae" "Sprattus" NA
## zotu.17 NA NA NA
## zotu.18 "Labridae" "Notolabrus" "Notolabrus_celidotus"
## zotu.2 "Myctophidae" "Lampichthys" "Lampichthys_procerus"
## zotu.20 "Tripterygiidae" "Forsterygion" "Forsterygion_lapillum"
## zotu.22 NA NA NA
## zotu.23 "Tripterygiidae" "Forsterygion" NA
## zotu.24 "Odacidae" "Odax" "Odax_pullus"
## zotu.25 "Nototheniidae" "Notothenia" "Notothenia_angustata"
## zotu.27 NA NA NA
## zotu.28 "Moridae" "Pseudophycis" "Pseudophycis_bachus"
## zotu.3 "Labridae" "Pseudolabrus" "Pseudolabrus_miles"
## zotu.30 NA NA NA
## zotu.31 NA NA NA
## zotu.32 "Clupeidae" "Sprattus" "Sprattus_muelleri"
## zotu.34 "Salmonidae" "Salmo" NA
## zotu.39 "Tripterygiidae" "Forsterygion" "Forsterygion_lapillum"
## zotu.4 "Clupeidae" "Sprattus" "Sprattus_antipodum"
## zotu.41 NA NA NA
## zotu.42 NA NA NA
## zotu.43 "Nototheniidae" "Notothenia" "Notothenia_angustata"
## zotu.44 "Clupeidae" "Sprattus" "Sprattus_muelleri"
## zotu.47 "Myctophidae" NA NA
## zotu.48 NA NA NA
## zotu.5 "Tripterygiidae" "Forsterygion" NA
## zotu.51 "Clupeidae" "Sprattus" "Sprattus_muelleri"
## zotu.53 "Mugilidae" "Aldrichetta" "Aldrichetta_forsteri"
## zotu.54 "Moridae" "Lotella" "Lotella_rhacina"
## zotu.56 "Bovichtidae" "Bovichtus" "Bovichtus_variegatus"
## zotu.59 NA NA NA
## zotu.6 "Myctophidae" "Lampichthys" "Lampichthys_procerus"
## zotu.60 "Clupeidae" "Sprattus" "Sprattus_muelleri"
## zotu.61 "Bovichtidae" "Bovichtus" "Bovichtus_variegatus"
## zotu.62 "Labridae" "Pseudolabrus" NA
## zotu.63 NA NA NA
## zotu.65 "Rajidae" "Dipturus" NA
## zotu.66 NA NA NA
## zotu.69 "Labridae" NA NA
## zotu.7 "Mugilidae" "Aldrichetta" "Aldrichetta_forsteri"
## zotu.70 "Labridae" "Notolabrus" "Notolabrus_celidotus"
## zotu.71 NA NA NA
## zotu.72 NA NA NA
## zotu.74 "Myctophidae" "Lampichthys" "Lampichthys_procerus"
## zotu.75 "Labridae" NA NA
## zotu.76 "Labridae" NA NA
## zotu.77 "Tripterygiidae" "Forsterygion" NA
## zotu.78 NA NA NA
## zotu.79 "Myctophidae" "Lampichthys" "Lampichthys_procerus"
## zotu.8 "Tripterygiidae" "Forsterygion" "Forsterygion_lapillum"
## zotu.80 NA NA NA
## zotu.81 NA NA NA
## zotu.82 NA NA NA
## zotu.83 "Labridae" NA NA
## zotu.84 NA NA NA
## zotu.85 NA NA NA
## zotu.86 "Labridae" NA NA
## zotu.88 NA NA NA
## zotu.9 "Latridae" "Latridopsis" "Latridopsis_forsteri"
## zotu.90 NA NA NA
## zotu.91 "Bovichtidae" "Bovichtus" "Bovichtus_variegatus"
## zotu.92 NA NA NA
## zotu.93 NA NA NA
## zotu.94 "Myctophidae" NA NA
## zotu.95 NA NA NA
Showing all significant (a<0.05) taxa that are responsible for characterising different habitat types.
What do these results mean?
Looks like there are a few taxa that are strongly associated with Open water. For example, zotu 5, was identified as some species of triple fin (Forsterygion), is found in 61.1% (stat = 0.611) of all the site in the Mudflats location type.
Out of 999 permutations, this taxa was a significant taxa in indicating a Mudflats location.
Most eDNA papers will include a map of the datapoints. It relatively straight forward to plot these in R and there is various options for this including leaflet, plotly and ggplot. We are going to use leaflet today, though I have also included some code for plotly which is a bit more flexible for design etc but with that flexibility comes complexity. More details of leaflet maps can be found here: https://rstudio.github.io/leaflet/
#leaflet
##packages
library(leaflet)
library(htmltools)
library(tidyverse)
library(RColorBrewer)
#load the metadata file
#this must include the longitude and latitude data + whatever data you wish to display
meta_data_map<-read.table(file='meta_data_maps.csv',header=T,sep=',', stringsAsFactors = F)
meta_data_map
## Sample_ID Replication_per_site Sample_name Sample_type Latitude Longitude
## 1 SB 1 5 Sandy beach 1 Sandy beach -45.77114 170.7009
## 2 SB 2 5 Sandy beach 2 Sandy beach -45.77366 170.7049
## 3 SB 3 5 Sandy beach 3 Sandy beach -45.78189 170.7126
## 4 SB 4 5 Sandy beach 4 Sandy beach -45.78342 170.7174
## 5 RS 1 5 Rocky shore 1 Rocky shore -45.77357 170.7154
## 6 RS 2 5 Rocky shore 2 Rocky shore -45.77068 170.7196
## 7 RS 3 5 Rocky shore 3 Rocky shore -45.77618 170.7119
## 8 MF 1 5 Mudflats 1 Mudflats -45.78304 170.7131
## 9 MF 2 5 Mudflats 2 Mudflats -45.78234 170.7114
## 10 MF 3 5 Mudflats 3 Mudflats -45.78291 170.7107
## 11 OW 1 5 Openwater 1 Openwater -45.77540 170.7758
## 12 OW 2 5 Openwater 2 Openwater -45.77277 170.8170
#here is basic map using leaflet
#most maps in R work in a layering manner so its easier to think of it as adding a layer on to a map
#the icons are clickable
#here I am loading the data and map then adding makers at each long/lat that have a pop up of the sample ID, sample name and replication per site
leaflet(meta_data_map) %>% addTiles() %>%
addMarkers(~Longitude, ~Latitude, popup = ~paste(Sample_ID, Sample_name, paste('N reps:', Replication_per_site,sep =' '), sep = "<br>"))
#we can make it a bit prettier by colouring things by sample type
#set your colour palets to have the same number of colours as you have samples (Im using Rcolourbrewer package here to make this easeir)
pal <- colorFactor(
palette = 'Dark2',
domain = meta_data_map$Sample_type
)
#nicer map: we are adding in a legend, changing the markers and adding some permanent label texts so it can be exported as a figure.
leaflet(meta_data_map) %>%
addTiles() %>%
addCircleMarkers(~Longitude, ~Latitude,
popup = ~paste(Sample_name, paste('# reps:', Replication_per_site,sep =' '), sep = "<br>"),
label = ~ as.character(Sample_type),
color = ~ pal(Sample_type),
radius = ~ sqrt(Replication_per_site)+4,
stroke = FALSE, fillOpacity = 1) %>%
addLegend("bottomright", pal = pal, values = ~Sample_type,
title = "Sample type",
opacity = 1) %>%
addLabelOnlyMarkers(~Longitude, ~Latitude,
label = ~ as.character(Sample_ID),labelOptions = labelOptions(noHide = T, direction = 'top', textOnly = T))
#unfortunately in my 2 minute google I could not find a way to implement a version of ggrepel in leaflet so the labels overlap - this is probably possible in ggplot
Im also going to include some code here for plotly which is a nice package for figures - conveniently there is a python and R version of plotly though you will obviously have to change the code for python. Maps in plotly work by you getting an account with mapbox and generating a authenticator key (this is free): https://docs.mapbox.com/help/getting-started/access-tokens/ More details on plotly maps can be found here: https://plotly.com/r/maps/ Below is some code to make a map this way, it is more flexible than leaflet but also a bit more tricky.
##plotly maps:
#Sys.setenv('MAPBOX_TOKEN' ='xxxyourkeyxxx')
##like leaflet these figures are layered on top
#map<-plot_mapbox(meta_data_map) %>%
# add_markers(
# x = ~Longitude,
# y = ~Latitude,
# size = ~Replication_per_site,
# color = ~Sample_type,
# colors = "Accent",
# text = ~paste(Sample_ID, Sample_name),
# hoverinfo = "text"
# )
##plotly defaults to Cameroon so you will have to go find NZ
#map
##add in a legend
#map<-map %>% layout(title = 'Sampling locations',
# legend = list(orientation = 'h',
# font = list(size = 8)),
# margin = list(l = 25, r = 25,
# b = 25, t = 25,
# pad = 2))
#map
##or we could turn it dark
#map<-map %>% layout(title = 'Sampling locations',
# font = list(color='white'),
# plot_bgcolor = '#191A1A', paper_bgcolor = '#191A1A',
# mapbox = list(style = 'dark'),
# legend = list(orientation = 'h',
# font = list(size = 8)),
# margin = list(l = 25, r = 25,
# b = 25, t = 25,
# pad = 2))
#map